1 Setup

setwd("/mnt/picea/projects/aspseq/jfelten/T89-Laccaria-bicolor")
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
suppressPackageStartupMessages(library(LSD))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(vsn))
source("~/Git/UPSCb/src/R/plot.multidensity.R")
source("~/Git/UPSCb/src/R/featureSelection.R")
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv("~/Git/UPSCb/projects/T89-Laccaria-bicolor/doc/Samples.csv") %>% 
  mutate(Time=factor(Time)) %>% 
  mutate(Experiment=factor(Experiment))
## Parsed with column specification:
## cols(
##   SciLifeID = col_character(),
##   SampleName = col_double(),
##   Time = col_double(),
##   Experiment = col_character(),
##   Replicate = col_character()
## )

2 Laccaria bicolor data

2.1 Raw data

counts <- suppressWarnings(read_csv("January/analysis/salmon/Lacbi-raw-unormalised-gene-expression_data.csv") %>% 
  column_to_rownames("X1") + 
  read_csv("analysis/salmon/Lacbi-raw-unormalised-gene-expression_data.csv") %>% 
  column_to_rownames("X1"))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.

2.2 Raw Data QC analysis

Check how many genes are never expressed

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "22.7% percent (5172) of 22822 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>% 
         bind_cols(samples[match(colnames(counts),samples$SciLifeID),]),
       aes(x,y,col=Experiment,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(melt(log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 5172 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>% 
  mutate(Time=samples$Time[match(ind,samples$SciLifeID)])

Color by Experiment

ggplot(dat,aes(x=values,group=ind,col=Experiment)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 269828 rows containing non-finite values (stat_density).

Color by Time

ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 269828 rows containing non-finite values (stat_density).

2.3 Export

dir.create(file.path("analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="analysis/salmon/Lacbi-all-raw-unormalised-gene-expression_data.csv")

2.4 Data normalisation

2.4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate

s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples[s.sel,],
  design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="analysis/salmon/Lacbi-all-dds.rda")

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is relatively variable (40 to 240 %)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P11915_101 P11915_102 P11915_103 P11915_104 P11915_105 P11915_106
0.9527 2.379 2.111 1.024 1.194 1.415
Table continues below
P11915_107 P11915_112 P11915_113 P11915_114 P11915_115 P11915_116
1.443 1.303 0.6593 0.6641 1.116 1.105
Table continues below
P11915_117 P11915_118 P11915_123 P11915_124 P11915_125 P11915_126
1.4 1.119 0.9421 0.7283 1.344 0.5157
Table continues below
P11915_127 P11915_128 P11915_129 P11915_134 P11915_135 P11915_136
0.65 0.8961 0.9009 1.401 1.027 1
Table continues below
P11915_137 P11915_138 P11915_139 P11915_140 P11915_145 P11915_146
0.4518 1.064 0.8489 0.9296 1.737 1.297
P11915_147 P11915_148 P11915_149 P11915_150 P11915_151
1.605 0.7587 0.7511 0.4468 0.9132
boxplot(sizes, main="Sequencing libraries size factor")

2.5 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked very well, the data is nearly homoscesdastic

meanSdPlot(vst[rowSums(counts)>0,])

2.6 QC on the normalised data

2.6.1 PCA

pc <- prcomp(t(vst))
  
percent <- round(summary(pc)$importance[2,]*100)

2.6.2 3 first dimensions

This looks interesting as the sample separate clearly both by Experiment and Time in the first 2 components.

mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(samples$Time)[s.sel]],
              pch=c(17,19)[as.integer(samples$Experiment)[s.sel]-1])
legend("topleft",
       fill=pal[1:nlevels(samples$Time)],
       legend=levels(samples$Time))
legend("bottomright",
       pch=c(17,19),
       legend=levels(samples$Experiment)[-1])

par(mar=mar)

2.6.3 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples[s.sel,])

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment,text=SciLifeID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
  scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
  scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))

ggplotly(p)
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used

## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used

2.6.4 Heatmap

Filter for noise A cutoff at a VST value of 1 leaves about 15000 genes, which is adequate for the QA

conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
                   conditions=conds,
                   nrep=3)

  • Heatmap of “all” genes Taking into account all the genes (above a noise thresholds), the samples cluster according to what we also see in the mapping rate plot, i.e. there is a correlation with the amount of sequences in the samples.
heatmap.2(t(scale(t(vst[sels[[2]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 most variable genes
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]

The subset is enriched for higher expression values

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering remains the same even for the most variable genes

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 least variable genes
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]

The subset is enriched for higher expression values, with a strinkingly normal distribution

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering for the least variable genes shows a separation by experiment and time point

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

2.7 Conclusion

The quality of the data is good. The PCA shows that the samples cluster by experiment and time, however, the heatmap shows a clustering that correlates with the mapping rates, i.e. the mixed amount of reads originating from either organism. The final heatmap seem to indicate that this effect is neglectable albeit confounded.

3 T89 ( Populus tremula x Populus tremuloides ) data

3.1 Raw data

counts <- suppressWarnings(read_csv("January/analysis/salmon/Potra-raw-unormalised-gene-expression_data.csv") %>% 
                             column_to_rownames("X1") + 
                             read_csv("analysis/salmon/Potra-raw-unormalised-gene-expression_data.csv") %>% 
                             column_to_rownames("X1"))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.

3.2 Raw Data QC analysis

Check how many genes are never expressed

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "11.6% percent (3935) of 34043 genes are not expressed"

The reads are mostly from the Cont samples. In the ECM samples, the majority of the reads is of fungal origin

ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>% 
         bind_cols(samples[match(colnames(counts),samples$SciLifeID),]),
       aes(x,y,col=Experiment,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(melt(log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 3935 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>% 
  mutate(Time=samples$Time[match(ind,samples$SciLifeID)])

Color by Experiment

ggplot(dat,aes(x=values,group=ind,col=Experiment)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 433688 rows containing non-finite values (stat_density).

Color by Time

ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 433688 rows containing non-finite values (stat_density).

3.3 Export

dir.create(file.path("analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="analysis/salmon/Potra-all-raw-unormalised-gene-expression_data.csv")

3.4 Data normalisation

3.4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate

s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples[s.sel,],
  design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="analysis/salmon/Potra-all-dds.rda")

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is highly variable (10 to 500 %)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P11915_104 P11915_105 P11915_106 P11915_107 P11915_108 P11915_109
0.09004 0.2524 0.1799 0.3481 2.86 2.21
Table continues below
P11915_110 P11915_111 P11915_115 P11915_116 P11915_117 P11915_118
2.183 1.316 0.1794 0.2533 0.1347 0.2124
Table continues below
P11915_119 P11915_120 P11915_121 P11915_122 P11915_126 P11915_127
2.263 2.94 2.474 2.13 1.101 0.2189
Table continues below
P11915_128 P11915_129 P11915_130 P11915_131 P11915_132 P11915_133
0.6912 0.5727 3.285 2.003 2.659 3.163
Table continues below
P11915_137 P11915_138 P11915_139 P11915_140 P11915_141 P11915_142
1.186 0.5998 0.199 0.258 2.13 4.864
Table continues below
P11915_143 P11915_144 P11915_148 P11915_149 P11915_150 P11915_151
3.768 1.938 1.634 0.5845 1.462 0.8478
P11915_152 P11915_153 P11915_154 P11915_155
2.605 4.047 5.025 2.015
boxplot(sizes, main="Sequencing libraries size factor")

3.5 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked adequately

meanSdPlot(vst[rowSums(counts)>0,])

3.6 QC on the normalised data

3.6.1 PCA

pc <- prcomp(t(vst))

percent <- round(summary(pc)$importance[2,]*100)

3.6.2 3 first dimensions

This looks interesting as the sample separate also both by Experiment and Time in the first 2 components, but somewhat not as clearly as for Laccaria bicolor

mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(samples$Time)[s.sel]],
              pch=c(17,19)[as.integer(samples$Experiment)[s.sel]])
legend("topleft",
       fill=pal[1:nlevels(samples$Time)],
       legend=levels(samples$Time))
legend("bottomright",
       pch=c(17,19),
       legend=levels(samples$Experiment)[-3])

par(mar=mar)

3.6.3 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples[s.sel,])

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment,text=SciLifeID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
  scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
  scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))

ggplotly(p)
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used

## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used

3.6.4 Heatmap

Filter for noise A cutoff at a VST value of 2 leaves about 14000 genes, which is adequate for the QA

conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3)

  • Heatmap of “all” genes Taking into account all the genes (above a noise thresholds), the samples cluster according to what we also see in the mapping rate plot, i.e. there is a correlation with the amount of sequences in the samples, but it is not as striking as for Laccaria bicolor. There is clustering by time, especially for the earlier time points. The later ECM time points are closer to the earlier ECM samples, while the later control cluster together, apart from some outliers.
heatmap.2(t(scale(t(vst[sels[[3]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 most variable genes
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]

The subset is enriched for higher expression values

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering shows a better clustering by time and experiment, even though there are still outliers

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 least variable genes
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]

The subset is enriched for higher expression values, with a strinkingly normal distribution

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering for the least variable genes is very similar to the overall profile

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

3.7 Conclusion

The same conclusion as for Laccaria bicolor apply, despite the difference in the heatmap behaviour that may indicate some bias due to the organisms.

These results suggests that a few samples could be removed as being under-sequenced

4 T89 ( Populus tremula x Populus tremuloides ) data with 3 samples removed

We removed samples P11915_104, P11915_127 and P11915_139 as having too few counts compared to their replicates

rm.sel <- colnames(counts) %in% c("P11915_104","P11915_127","P11915_139")
pander(colSums(counts)[rm.sel])
P11915_104 P11915_127 P11915_139
129865 327104 315206
boxplot(list(removed=colSums(counts)[rm.sel],kept=colSums(counts)[!rm.sel]))

counts <- counts[,!rm.sel]

4.1 Raw Data QC analysis

Check how many genes are never expressed

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "11.6% percent (3951) of 34043 genes are not expressed"

The reads are mostly from the Cont samples. In the ECM samples, the majority of the reads is of fungal origin

ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>% 
         bind_cols(samples[match(colnames(counts),samples$SciLifeID),]),
       aes(x,y,col=Experiment,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(melt(log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 3951 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>% 
  mutate(Time=samples$Time[match(ind,samples$SciLifeID)])

Color by Experiment

p <- ggplot(dat,aes(x=values,group=ind,col=Experiment)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")

ggplotly(p)
## Warning: Removed 386051 rows containing non-finite values (stat_density).

Color by Time

p <- ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")

ggplotly(p)
## Warning: Removed 386051 rows containing non-finite values (stat_density).

4.2 Export

dir.create(file.path("analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="analysis/salmon/Potra-104-127-139-removed-raw-unormalised-gene-expression_data.csv")

4.3 Data normalisation

4.3.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate

s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples[s.sel,],
  design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="analysis/salmon/Potra-104-127-139-removed-dds.rda")

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is highly variable (15 to 420 %)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P11915_105 P11915_106 P11915_107 P11915_108 P11915_109 P11915_110
0.2151 0.1545 0.2963 2.469 1.901 1.886
Table continues below
P11915_111 P11915_115 P11915_116 P11915_117 P11915_118 P11915_119
1.125 0.1532 0.2175 0.1161 0.181 1.951
Table continues below
P11915_120 P11915_121 P11915_122 P11915_126 P11915_128 P11915_129
2.508 2.118 1.831 0.9529 0.5889 0.4883
Table continues below
P11915_130 P11915_131 P11915_132 P11915_133 P11915_137 P11915_138
2.857 1.722 2.283 2.76 1.02 0.5047
Table continues below
P11915_140 P11915_141 P11915_142 P11915_143 P11915_144 P11915_148
0.2169 1.848 4.2 3.243 1.689 1.42
Table continues below
P11915_149 P11915_150 P11915_151 P11915_152 P11915_153 P11915_154
0.5044 1.256 0.7182 2.255 3.538 4.329
P11915_155
1.741
boxplot(sizes, main="Sequencing libraries size factor")

4.4 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked adequately

meanSdPlot(vst[rowSums(counts)>0,])

4.5 QC on the normalised data

4.5.1 PCA

pc <- prcomp(t(vst))

percent <- round(summary(pc)$importance[2,]*100)

4.5.2 3 first dimensions

This looks interesting as the sample separate also both by Experiment and Time in the first 2 components, but somewhat not as clearly as for Laccaria bicolor

mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(samples$Time)[s.sel]],
              pch=c(17,19)[as.integer(samples$Experiment)[s.sel]])
legend("topleft",
       fill=pal[1:nlevels(samples$Time)],
       legend=levels(samples$Time))
legend("bottomright",
       pch=c(17,19),
       legend=levels(samples$Experiment)[-3])

par(mar=mar)

4.5.3 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples[s.sel,])

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment,text=SciLifeID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
  scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
  scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))

ggplotly(p)
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used

## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used

4.5.4 Heatmap

Filter for noise A cutoff at a VST value of 2 leaves about 14000 genes, which is adequate for the QA

conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3)

  • Heatmap of “all” genes Taking into account all the genes (above a noise thresholds), the clustering is not significantly improved compared to using all samples.
heatmap.2(t(scale(t(vst[sels[[3]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 most variable genes
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]

The subset is slightly enriched for higher expression values

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering shows a perfect clustering by experiment

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

Plotting the same using the SciLife IDs instead

The 3 samples clustering together ECM 14, 28 and 28 (126, 148, 150) are those that are the most deeply sequenced. So there is a bias there. However, sample 137 (equally deeply sequenced) clusters with the other ECM 14 and 21 samples.

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = samples$SciLifeID[s.sel],
          col=hpal)

  • Heatmap of the 1000 least variable genes
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]

The subset is enriched for higher expression values, with a strinkingly normal distribution

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering for the least variable genes is very similar to the overall profile in the sense that it shows increased variation in the clustering pattern

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

4.6 Conclusion

Using the ECM data for P. tremula x P. tremuloides in comparisons to Cont is going to be sub-optimal. Results will be biased by the lack of sequencing depth especially at the early time points. There still seem to be informative signal in the data, but the results will be partial. Luckily, this will only affect the TPR, i.e. we will miss on some real effects, but the FDR will not be affected

5 Populus trichocarpa data

5.1 Raw data

counts <- suppressWarnings(read_csv("January/analysis/salmon/Potri-raw-unormalised-gene-expression_data.csv") %>% 
                             column_to_rownames("X1") + 
                             read_csv("analysis/salmon/Potri-raw-unormalised-gene-expression_data.csv") %>% 
                             column_to_rownames("X1"))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.

5.2 Raw Data QC analysis

Check how many genes are never expressed

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "16.5% percent (6807) of 41270 genes are not expressed"

The reads are mostly from the Cont samples. In the ECM samples, the majority of the reads is of fungal origin

ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>% 
         bind_cols(samples[match(colnames(counts),samples$SciLifeID),]),
       aes(x,y,col=Experiment,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(melt(log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 6807 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>% 
  mutate(Time=samples$Time[match(ind,samples$SciLifeID)])

Color by Experiment

ggplot(dat,aes(x=values,group=ind,col=Experiment)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 671471 rows containing non-finite values (stat_density).

Color by Time

ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 671471 rows containing non-finite values (stat_density).

5.3 Export

dir.create(file.path("analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="analysis/salmon/Potri-all-raw-unormalised-gene-expression_data.csv")

5.4 Data normalisation

5.4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate

s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples[s.sel,],
  design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="analysis/salmon/Potri-all-dds.rda")

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is highly variable (10 to 500 %)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P11915_104 P11915_105 P11915_106 P11915_107 P11915_108 P11915_109
0.0896 0.2524 0.1809 0.3476 2.87 2.213
Table continues below
P11915_110 P11915_111 P11915_115 P11915_116 P11915_117 P11915_118
2.183 1.325 0.1816 0.2536 0.1359 0.2119
Table continues below
P11915_119 P11915_120 P11915_121 P11915_122 P11915_126 P11915_127
2.245 2.969 2.478 2.144 1.106 0.2158
Table continues below
P11915_128 P11915_129 P11915_130 P11915_131 P11915_132 P11915_133
0.6965 0.575 3.251 2.015 2.668 3.139
Table continues below
P11915_137 P11915_138 P11915_139 P11915_140 P11915_141 P11915_142
1.223 0.6112 0.1993 0.2585 2.099 4.847
Table continues below
P11915_143 P11915_144 P11915_148 P11915_149 P11915_150 P11915_151
3.82 1.902 1.632 0.5881 1.465 0.8566
P11915_152 P11915_153 P11915_154 P11915_155
2.564 3.973 5.014 1.996
boxplot(sizes, main="Sequencing libraries size factor")

5.5 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked adequately

meanSdPlot(vst[rowSums(counts)>0,])

5.6 QC on the normalised data

5.6.1 PCA

pc <- prcomp(t(vst))

percent <- round(summary(pc)$importance[2,]*100)

5.6.2 3 first dimensions

This looks interesting as the sample separate also both by Experiment and Time in the first 2 components, but somewhat not as clearly as for Laccaria bicolor

mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(samples$Time)[s.sel]],
              pch=c(17,19)[as.integer(samples$Experiment)[s.sel]])
legend("topleft",
       fill=pal[1:nlevels(samples$Time)],
       legend=levels(samples$Time))
legend("bottomright",
       pch=c(17,19),
       legend=levels(samples$Experiment)[-3])

par(mar=mar)

5.6.3 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples[s.sel,])

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment,text=SciLifeID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
  scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
  scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))

ggplotly(p)
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used

## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used

5.6.4 Heatmap

Filter for noise A cutoff at a VST value of 2 leaves about 14000 genes, which is adequate for the QA

conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3)

  • Heatmap of “all” genes Taking into account all the genes (above a noise thresholds), the samples cluster according to what we also see in the mapping rate plot, i.e. there is a correlation with the amount of sequences in the samples, but it is not as striking as for Laccaria bicolor. There is clustering by time, especially for the earlier time points. The later ECM time points are closer to the earlier ECM samples, while the later control cluster together, apart from some outliers.
heatmap.2(t(scale(t(vst[sels[[3]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 most variable genes
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]

The subset is enriched for higher expression values

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering shows a better clustering by time and experiment, even though there are still outliers

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 least variable genes
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]

The subset is enriched for higher expression values, with a strinkingly normal distribution

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering for the least variable genes is very similar to the overall profile

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

5.7 Conclusion

The same conclusion as for Laccaria bicolor apply, despite the difference in the heatmap behaviour that may indicate some bias due to the organisms.

These results suggests that a few samples could be removed as being under-sequenced

6 Session Info

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vsn_3.52.0                  forcats_0.4.0              
##  [3] stringr_1.4.0               dplyr_0.8.3                
##  [5] purrr_0.3.2                 readr_1.3.1                
##  [7] tidyr_0.8.3                 tibble_2.1.3               
##  [9] tidyverse_1.2.1             scatterplot3d_0.3-41       
## [11] RColorBrewer_1.1-2          plotly_4.9.0               
## [13] pander_0.6.3                LSD_4.0-0                  
## [15] hyperSpec_0.99-20180627     ggplot2_3.2.1              
## [17] lattice_0.20-38             gplots_3.0.1.1             
## [19] DESeq2_1.24.0               SummarizedExperiment_1.14.1
## [21] DelayedArray_0.10.0         BiocParallel_1.18.1        
## [23] matrixStats_0.54.0          Biobase_2.44.0             
## [25] GenomicRanges_1.36.0        GenomeInfoDb_1.20.0        
## [27] IRanges_2.18.2              S4Vectors_0.22.0           
## [29] BiocGenerics_0.30.0         data.table_1.12.2          
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1       htmlTable_1.13.1       XVector_0.24.0        
##   [4] base64enc_0.1-3        rstudioapi_0.10        hexbin_1.27.3         
##   [7] affyio_1.54.0          bit64_0.9-7            AnnotationDbi_1.46.1  
##  [10] lubridate_1.7.4        xml2_1.2.2             splines_3.6.1         
##  [13] geneplotter_1.62.0     knitr_1.24             zeallot_0.1.0         
##  [16] Formula_1.2-3          jsonlite_1.6           broom_0.5.2           
##  [19] annotate_1.62.0        cluster_2.1.0          shiny_1.3.2           
##  [22] BiocManager_1.30.4     compiler_3.6.1         httr_1.4.1            
##  [25] backports_1.1.4        assertthat_0.2.1       Matrix_1.2-17         
##  [28] lazyeval_0.2.2         limma_3.40.6           cli_1.1.0             
##  [31] later_0.8.0            acepack_1.4.1          htmltools_0.3.6       
##  [34] tools_3.6.1            affy_1.62.0            gtable_0.3.0          
##  [37] glue_1.3.1             GenomeInfoDbData_1.2.1 reshape2_1.4.3        
##  [40] Rcpp_1.0.2             cellranger_1.1.0       vctrs_0.2.0           
##  [43] preprocessCore_1.46.0  gdata_2.18.0           nlme_3.1-141          
##  [46] crosstalk_1.0.0        xfun_0.9               testthat_2.2.1        
##  [49] rvest_0.3.4            mime_0.7               gtools_3.8.1          
##  [52] XML_3.98-1.20          zlibbioc_1.30.0        scales_1.0.0          
##  [55] promises_1.0.1         hms_0.5.1              yaml_2.2.0            
##  [58] memoise_1.1.0          gridExtra_2.3          rpart_4.1-15          
##  [61] latticeExtra_0.6-28    stringi_1.4.3          RSQLite_2.1.2         
##  [64] highr_0.8              genefilter_1.66.0      checkmate_1.9.4       
##  [67] caTools_1.17.1.2       rlang_0.4.0            pkgconfig_2.0.2       
##  [70] bitops_1.0-6           evaluate_0.14          labeling_0.3          
##  [73] htmlwidgets_1.3        bit_1.1-14             tidyselect_0.2.5      
##  [76] plyr_1.8.4             magrittr_1.5           R6_2.4.0              
##  [79] generics_0.0.2         Hmisc_4.2-0            DBI_1.0.0             
##  [82] pillar_1.4.2           haven_2.1.1            foreign_0.8-72        
##  [85] withr_2.1.2            survival_2.44-1.1      RCurl_1.95-4.12       
##  [88] nnet_7.3-12            modelr_0.1.5           crayon_1.3.4          
##  [91] KernSmooth_2.23-15     rmarkdown_1.15         locfit_1.5-9.1        
##  [94] readxl_1.3.1           blob_1.2.0             digest_0.6.20         
##  [97] xtable_1.8-4           httpuv_1.5.1           munsell_0.5.0         
## [100] viridisLite_0.3.0